library(tidyverse)
library(sf)
library(raster)# Load the merged Satellite image for Kazanlak Valley, Bulgaria
# use cds-spatial repo from github https://github.com/CDS-AU-DK/cds-spatial
kaz <-brick("../1_Teaching/cds-spatial/data/Kaz.tif")
plotRGB(kaz, stretch= "lin")crs(kaz)## CRS arguments:
## +proj=utm +zone=35 +datum=WGS84 +units=m +no_defs
# Load prediction data as points (left bottom corner of the evaluated cell)
cnne_df <- read_csv("2021-10-25.predictions/results/east/east.csv")
# 15334 points (origins in the raster cells)
cnnw_df <- read_csv("2021-10-25.predictions/results/west/west.csv")
# 15334 points (origins in the raster cells)
cnn_df <- rbind(cnne_df, cnnw_df)
# 30504 rows
# Check the probabilities of there being a mound in a given cell
hist(cnn_df$mound_probability,
main = "Probability of mound present in a satellite image gridcell");abline(v=0.6, col="red", lty = 2, lwd = 3)table(cut(cnn_df$mound_probability, c(0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1)))##
## (0,0.1] (0.1,0.2] (0.2,0.3] (0.3,0.4] (0.4,0.5] (0.5,0.6] (0.6,0.7] (0.7,0.8]
## 7865 7710 6983 4757 2243 653 195 58
## (0.8,0.9] (0.9,1]
## 30 10
# ok, so there is an exponential drop-off. Let’s make the predictions to a grid so we can assess whether gridcells with higher probabilities of mound presence in fact contain burial mounds. To keep the grid lightweight, let’s start with those gridcells that have values of mound probability at 60% and higher.
### Build a grid of those with 60%+ probability of containing a mound
# Build a grid from all points
#cnnall_sp <- st_as_sf(cnn_df, coords = c("x","y"), crs = 32635)
#cnnall_grid <- st_make_grid(cnnall_sp, cellsize = 250, what = "polygons")
# Filter predictions to those that have 60+% likelihood of containing a mound
cnn60_sp <- cnn_df %>%
filter(mound_probability > 0.59) %>% #333 observations
st_as_sf(coords = c("x","y"), crs = 32635)
# Make a grid of 60%+ cells, 382 cells, 332 unique ones
cnn_grid <- st_make_grid(cnn60_sp, cellsize = 250, what = "polygons")
#plot(cnn_grid)
# Add probability data to the 60%+ grid
#cnnall_datagrid <- st_join(st_sf(cnnall_grid), cnnall_sp)
cnn_grid <- st_join(st_sf(cnn_grid), cnn60_sp)
# Visualize the grid cells with higher probability
ggplot(cnn_grid) +
geom_sf(aes(color = mound_probability))# 333 raster cells are predicted to contain mounds with greater
# than 60% likelihood. Archaeologists found 773 mounds in fraction of the area
# View the grid
plotRGB(kaz, stretch= "lin");
plot(cnn_grid$geometry, add = TRUE, border = "white")that we will need later for validation
# Bring in all the mounds observed in the field
mounds <- st_read("../1_Teaching/cds-spatial/data/KAZ_mounds.shp")## Reading layer `KAZ_mounds' from data source `C:\Users\Adela\Documents\RStudio\1_Teaching\cds-spatial\data\KAZ_mounds.shp' using driver `ESRI Shapefile'
## Simple feature collection with 773 features and 5 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 352481.3 ymin: 4712325 xmax: 371282.4 ymax: 4730029
## projected CRS: WGS 84 / UTM zone 35N
plot(mounds$geometry, main = "Mounds found through survey")# Bring in survey area to see overall coverage
survey <- st_read("../1_Teaching/cds-spatial/data/KAZ_surveyarea.shp")## Reading layer `KAZ_surveyarea' from data source `C:\Users\Adela\Documents\RStudio\1_Teaching\cds-spatial\data\KAZ_surveyarea.shp' using driver `ESRI Shapefile'
## Simple feature collection with 98 features and 5 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 352181.7 ymin: 4712923 xmax: 368890.2 ymax: 4733359
## projected CRS: WGS 84 / UTM zone 35N
plot(survey$geometry, main = "Area covered by survey")# Extrapolate rough study area after subtracting outliers
far <- mounds %>%
st_is_within_distance(survey, 1500) %>%
lengths>0
farmounds <- which(far==0) # these are the mounds that are mostly far from survey area
# convex hull of points without outliers
survey_sm <- st_convex_hull(st_union(mounds$geometry[-farmounds]))
# convex hull of survey polygons
survey_ch <- st_convex_hull(st_union(survey$geometry))
# quickly see the difference in bounding boxes
plot(survey_ch, col = "green", main = "Convex hull around survey area (tight and extensive)");plot(survey_sm, col = "red", alpha=0.5, add = TRUE); plot(survey$geometry, add =TRUE)# View all mounds
plotRGB(kaz, stretch= "lin");
plot(survey_ch, col = "green", add = TRUE);
plot(survey_sm, col = "red", add = TRUE);
plot(survey$geometry, col = "lightyellow", add = TRUE );
plot(mounds$geometry[-farmounds,], add = TRUE, col = "pink");
plot(cnn_grid$geometry, add = TRUE, border = "white");
legend("bottom", legend = "Grid cells with 60%+ probability of mounds (white squares) overlayed on \nthe satellite image and survey study area outlines with mounds (pink circles)")The Tundzha Regional Archaeological project surveyed 85 sq km of the area included in the satellite imagery in 2009-2011, documenting the burial mounds within which create a dataset suitable for validation.
Area surveyed contains 773 documented mounds of various shapes and sizes. This area also contains 192 of 332 grid cells of 250m a side which were allocated 0.6+ probability of containing a mound. Let’s explore how many mounds are outside these grid cells and which mounds were predicted and which not. Bring in mound attributes and check correlation. Does size play a role?
# Bring in mound attributes (n = 773) to aid exploration
mounddata <- read_csv("../1_Teaching/cds-spatial/data/KAZ_mdata.csv")
mounds <- mounds %>%
left_join(mounddata, by = c("TRAP_Code"="MoundID"))Let’s see which of the grid cells with high probability of containing a mound are in the TRAP study area?
# Select gridcells that fall within TRAP study area
survey_grids60 <- st_intersection(survey_ch, cnn_grid) # 192 grid cells with 60%+
#plot(survey_grids60)Archaeological survey documented 773 mounds in the study area. Mounds from this dataset that fall in the 60%+ probability gridcells form the true positives and comprise the success of the CNN model.
# Check spatial overlap between 60%+ grid cells and all 773 mounds
predicted <- st_intersection(mounds, cnn_grid)# %>% distinct()
# Which ones are not-predicted?
`%nin%` = Negate(`%in%`)
unpredicted <- mounds[mounds$TRAP_Code%nin%predicted$TRAP_Code,] # 627 not predicted
head(predicted,2) #146 unique features among 166 features caught via intersecton## Simple feature collection with 2 features and 11 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 360896.8 ymin: 4713055 xmax: 361446.8 ymax: 4713591
## projected CRS: WGS 84 / UTM zone 35N
## Notes TRAP_Code Cluster Lat Long Condition Robbed Height LandUse
## 82 Mound 4066 3 42.55737 25.30552 3 1 5 Pasture
## 83 Mound 4067 3 42.56229 25.31209 2 1 2 Pasture
## X1 mound_probability geometry
## 82 4748 0.8855298 POINT (360896.8 4713055)
## 83 5499 0.7194230 POINT (361446.8 4713591)
length(unique(predicted$TRAP_Code)) # 146 unique mounds## [1] 146
length(unique(cnn_grid$X1)) #332 unique cells## [1] 332
grids_n_mounds <- predicted %>%
# st_drop_geometry() %>%
group_by(X1) %>%
count()
length(unique(grids_n_mounds$X1)) # all the detected mounds are withing 51 grids, out of which one is labelled NA, so 50 it is## [1] 50
hist(grids_n_mounds$n, breaks = 40, main = "Frequency of predicted mounds (146) per grid cell(n = 50")# Summary: 50 out of 332 Grid cells with 60%+ probability contain between 1 and 29 detected mounds, which makes sense for the necropolis of little 10m diameter mounds in the NWThe previous chunk shows that 146 of 773 unique mounds appear in 51 of the 192 “survey” grids that have 60% or higher probability of mounds and are at least partially within the TRAP study area. The intersection actually shows 166 points, but some fall on the border of two gridcells and are double-counted. The entire satellite image contains 332 grids with 60%+ probability of mounds, 192 fall in the TRAP study area and 50 of these contain 146 mounds.
# Add the information on prediction to mound dataset
mounds <- mounds %>%
mutate(predicted60 = case_when(TRAP_Code %in% predicted$TRAP_Code ~ "yes",
TRAP_Code %nin% predicted$TRAP_Code ~ "no"
))
# Look at the predicted and un=predicted mounds
unpredicted <- mounds[mounds$TRAP_Code%nin%predicted$TRAP_Code,]
plot(unpredicted$geometry, col = "red", main = "Mounds predicted (green) and unpredicted (red) by CNN");plot(predicted$geometry, col = "darkgreen", add= TRUE); Let’s look at whether mound Height is a factor impacting detectability. (it is a factor for visual inspection as higher mounds are more visible in sat img.)
# filter the mounds for largesh and noticeable ones (2+ meters)
large <- mounds %>%
filter(Height>=2) # 247 obs
mounds %>%
group_by(predicted60) %>%
summarize(min = min(Height), max = max(Height))## Simple feature collection with 2 features and 3 fields
## geometry type: MULTIPOINT
## dimension: XY
## bbox: xmin: 352481.3 ymin: 4712325 xmax: 371282.4 ymax: 4730029
## projected CRS: WGS 84 / UTM zone 35N
## # A tibble: 2 x 4
## predicted60 min max geometry
## <chr> <dbl> <dbl> <MULTIPOINT [m]>
## 1 no 0 20 ((352505.2 4725060), (352519.3 4725007), (352536.4 47~
## 2 yes 0.1 10 ((352481.3 4724776), (352488.8 4724747), (352489.3 47~
# Check the height difference between un- and predicted mounds
boxplot(Height~predicted60, data = mounds, xlab = "Was mound detected?")# Check height distribution btw un- and predicted mounds
hist(mounds$Height[mounds$predicted60 == "no"], col = "pink", breaks = 20,main = "Histogram of detected (yellow) and undetected (pink) mounds by height",
xlab = "Height (m)");
hist(mounds$Height[mounds$predicted60 == "yes"], col = "yellow", add =TRUE, alpha = 0.5);# Is there a significant relationship between the Height of a mound and its detection?
t.test(Height ~ as.factor(predicted60), data = mounds)##
## Welch Two Sample t-test
##
## data: Height by as.factor(predicted60)
## t = 4.2302, df = 289.8, p-value = 3.136e-05
## alternative hypothesis: true difference in means between group no and group yes is not equal to 0
## 95 percent confidence interval:
## 0.3825377 1.0482416
## sample estimates:
## mean in group no mean in group yes
## 1.877033 1.161644
str(t.test(Height ~ as.factor(predicted60), data = mounds))## List of 10
## $ statistic : Named num 4.23
## ..- attr(*, "names")= chr "t"
## $ parameter : Named num 290
## ..- attr(*, "names")= chr "df"
## $ p.value : num 3.14e-05
## $ conf.int : num [1:2] 0.383 1.048
## ..- attr(*, "conf.level")= num 0.95
## $ estimate : Named num [1:2] 1.88 1.16
## ..- attr(*, "names")= chr [1:2] "mean in group no" "mean in group yes"
## $ null.value : Named num 0
## ..- attr(*, "names")= chr "difference in means between group no and group yes"
## $ stderr : num 0.169
## $ alternative: chr "two.sided"
## $ method : chr "Welch Two Sample t-test"
## $ data.name : chr "Height by as.factor(predicted60)"
## - attr(*, "class")= chr "htest"
# ... the answer seems to be yes with p at 3.14e-05Given 146 mounds were predicted by CNN, where are the remaining 627? Let’s explore where the 627 missing mounds are and what probability has been assigned to the cells that contain them by CNN model.
# Create a grid of ALL the predictions to see where the remaining 627 unpredicted mounds are
cnnall_sp <- st_as_sf(cnn_df, coords = c("x","y"), crs = 32635)
#plot(cnnall_sp)
cnnall_grid <- st_make_grid(cnnall_sp, cellsize = 250, what = "polygons")
#plot(cnnall_grid) # we can see that the cells overlap in the middle
cnnall_grid <- st_join(st_sf(cnnall_grid), cnnall_sp) # add attributes
# Trying to eliminate overlap in gridcells
# eliminating id duplicates is nonsensical as the two datasets have same ids, I must aggregate spatially, e.g. st_union or st_difference, or relabel the datasets before merging, or crop to a shared non-overlapping border.
missing <- st_intersection(unpredicted, cnnall_grid) # 2158, clearly duplicates
# Check for duplicates
?distinct()
length(duplicated(missing$TRAP_Code)) # 2158 points to duplication## [1] 2158
missing <- missing[!duplicated(missing$TRAP_Code),] #620 undetected moundsSo we have 620 missing mounds. How many grids are these located within?
# How many grids are the missing mounds in?
missing %>%
st_drop_geometry() %>%
group_by(X1) %>%
tally() %>% arrange(desc(n))## # A tibble: 259 x 2
## X1 n
## <dbl> <int>
## 1 7551 28
## 2 9209 15
## 3 7366 13
## 4 9766 12
## 5 7367 11
## 6 7552 11
## 7 7934 10
## 8 8291 10
## 9 8474 10
## 10 8477 10
## # ... with 249 more rows
# The missing 620 mounds are contained within 259 grid cells of the satellite image
# Let's see the distribution of missing mounds per grid cell
missing %>%
group_by(X1) %>%
count() %>%
ggplot()+ geom_histogram(aes(n))+ggtitle("Count of undetected mounds (620) per gridcell(n=259)") + theme_bw()# some gridcells contain up to 28 undetected mounds. No surprise with the NW necropolisUndetected mounds appear in 259 gridcells, which contain between 1 and 29 mounds, heavily skewed to the left.
What is the probability of the cells that contain undetected mounds?
# What is the mound_probability in these gridcells?
hist(missing$mound_probability, main = "Probability of gridcells containing undetected mounds")Somehow 5 of the undetected mounds still have high mound-probability!? How have these snuck through? Maybe they are outside the study area?
Probability in five undetected mounds is showing to be over 0.59 when it should be below! Let’s visually inspect what is going on and investigate:
# Ggplot of mounds with high prob and grid with high prob
# missing %>%
# filter(mound_probability>0.59) %>%
# ggplot()+
# geom_sf(aes(color = mound_probability))+
# geom_sf(data = cnn_grid, alpha = 0.5)
library(mapview)
missing %>%
filter(mound_probability>0.59) %>%
mapview(color = "red")+
# mapview(cnn_grid, alpha = 0.5)+
mapview(cnnall_grid, zcol= "mound_probability")